# Suppress pandas/plotly FutureWarnings so the notebook output stays readable.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
We fetch the data from OWID and interpolate/upsample the missing values to a weekly frequency.
# Pull the latest OWID COVID dataset; the Excel file carries its own dtypes.
d = pd.read_excel('https://covid.ourworldindata.org/data/owid-covid-data.xlsx')
# Most recent reporting date in the dataset, used later in chart titles.
max_date = d['date'].max()
print(max_date)
2022-10-03
If a country has not a single row with both vaccination and excess-mortality data, we have nothing to work with, so we filter such countries out completely.
# Per-country correlation between excess mortality and total vaccinations.
pair = ['excess_mortality', 'total_vaccinations_per_hundred']
country_corr = (
    d.groupby('iso_code')[pair]
    .corr()
    .unstack()
    .iloc[:, 1]          # cross-correlation cell of each 2x2 matrix
    .reset_index()
)
country_corr.columns = ['iso_code', 'correlation']
# Countries that have at least one row carrying both metrics, best correlation first.
valid = country_corr[~country_corr['correlation'].isna()]
countries_worth_looking_at = valid.sort_values('correlation', ascending=False)['iso_code'].tolist()
# iso_code -> "ISO  x.xx" labels, used later as facet titles.
labels = country_corr.iso_code + country_corr.correlation.apply(" {0:.2f}".format)
country_corr_dict = dict(zip(country_corr.iso_code, labels))
dfc = d[d['iso_code'].isin(countries_worth_looking_at)].copy()
Some countries report the data monthly while others have missing data points. We resample everything to the weekly grain and interpolate via backfilling. This way the vaccination time series before the start of reporting are backfilled with a static value (the first value in the time series), mostly close to zero. This makes the year 2020 a kind of synthetic control with regard to the vaccination treatment.
# resample() requires a DatetimeIndex, so key the frame on the parsed date.
dfc['datetime'] = pd.to_datetime(dfc['date'])
dfc = dfc.set_index('datetime')
# Weekly mean per country; weeks with no observation come out as NaN rows
# (those NaNs are what the backfill step below fills in).
dfi = dfc.groupby('iso_code').resample('W').mean().reset_index()
dfi = dfi.set_index(pd.to_datetime(dfi['datetime']))
del dfi['datetime']
# Backfill missing values within each country so that pre-reporting periods
# inherit the first reported value (the "synthetic control" effect for 2020).
# FIX: interpolate(method='bfill') is deprecated in modern pandas —
# DataFrame.bfill() is the supported equivalent. Also iterate the unique
# codes directly instead of recomputing unique() on every loop pass.
for code in dfi['iso_code'].unique():
    mask = dfi['iso_code'] == code
    dfi[mask] = dfi[mask].bfill()
dfi.reset_index(inplace=True)
# Overall (pooled) correlation across all countries and weeks.
# FIX: Series[1] relied on deprecated positional-fallback indexing of a
# labelled Series; read the cross-correlation cell via .iloc instead.
overall_corr = dfi[['excess_mortality', 'total_vaccinations_per_hundred']].corr().iloc[0, 1]
print('Correlation total_vaccinations_per_hundred and excess_mortality ='
      , '\x1b[1;30;30m{:.2f}\x1b[m'.format(overall_corr))
Correlation total_vaccinations_per_hundred and excess_mortality = -0.06
At the moment of writing the correlation is close to zero (-0.06). Let's de-average the overall value by calculating the correlation per country.
We chart correlation between excess_mortality and total_vaccinations_per_hundred (and other vaccination metrics) per country to make sure the data is sane.
The Choropleth Charts give an idea how the correlation is distributed over the globe. Maybe seasonality plays some role in the resulting in-country correlation.
The Histograms show that the in-country correlation is fairly evenly distributed over the whole [-1:1] interval and there are no visible outliers.
# For each vaccination metric: compute the per-country correlation with
# excess mortality (on the raw, non-resampled data dfc), then render a
# choropleth world map and a histogram + kernel-density plot of the values.
# FIX: corrected the user-facing y-axis label typo "Probabililty".
for metric in ['total_vaccinations_per_hundred'
               , 'people_vaccinated_per_hundred'
               , 'people_fully_vaccinated_per_hundred'
               , 'total_boosters_per_hundred'
               , 'new_cases_per_million' ]:
    # cross-correlation cell of each country's 2x2 correlation matrix
    country_corr = dfc.groupby('iso_code')[
        ['excess_mortality', metric]
    ].corr().unstack().iloc[:, 1].reset_index()
    colname = 'correlation excess_mortality and ' + metric
    country_corr.columns = ['iso_code', colname]
    # world map of the per-country correlations
    fig = px.choropleth(
        country_corr,
        locationmode = 'ISO-3',
        locations = 'iso_code',
        color = colname,
    )
    fig.update_traces(
        showlegend=False
        , selector=dict(type='choropleth'))
    fig.update_layout(
        width=2048,
        height=800,
        title_text=colname,
        geo=dict( showcoastlines=False,),
        coloraxis_colorbar=dict( title='Scale, Correlation',),
    )
    fig.show('png')
    # distribution of the same correlations over [-1:1]
    fig = ff.create_distplot(
        [country_corr.dropna()[colname]]
        , [colname]
        , bin_size=.2
        , histnorm = 'probability'
    )
    fig.update_layout(
        title_text='Country distribution over the correlation [-1:1] and kernel density estimation'
        , xaxis_title="Countries over Correlation per Country"
        , yaxis_title="Probability and Kernel Density Estimation"
        , legend_x=0
    )
    fig.show()
# Helper column for facet titles: "ISO  <corr>" labels from country_corr_dict.
dfi['cntr'] = dfi['iso_code'].replace(country_corr_dict, inplace=False)
fig = px.line(dfi,
              x='datetime',
              y=['total_vaccinations_per_hundred', 'excess_mortality'],
              facet_col='cntr',
              # FIX: category_orders must key on the column actually used for
              # faceting ('cntr'); keying on 'iso_code' silently had no effect.
              category_orders={'cntr': [country_corr_dict.get(c, c)
                                        for c in sorted(countries_worth_looking_at)]},
              facet_col_wrap=4,
              facet_row_spacing=0.01,
              height=6000, #width=800, # change to fit your output media
              labels={
                  "value": "per hundred / percent",
                  "variable": "",
              },
              title="Country Charts and Corr, Time Series Backfilled Weekly, Jan 2020 - "+str(max_date),
              )
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.999,
        xanchor="left",
        x=0.01
    ),
    # FIX: keep the axis label in sync with the actual data range instead of
    # the previously hard-coded "Jan 2022".
    xaxis_title="Weekly Data Jan 2020 - "+str(max_date)
)
fig.update_yaxes(range=[-50, 250])
fig.show() # we can't use 'png' here in JupyterLab as the charts get scrambled, a plotly bug?
For each vaccine we split all countries into two groups by vaccine presence and calculate the correlation within each of the two groups. Obviously there are countries with multiple vaccines present, and some vaccines may only appear together in all countries. Yet the results are interesting and non-trivial.
# Per-country vaccine list from the OWID vaccinations metadata.
v = pd.read_csv('https://github.com/owid/covid-19-data/raw/master/public/data/vaccinations/locations.csv', sep=',', quotechar='"'
                , header=0, usecols=['iso_code', 'vaccines'], dtype={'iso_code':'str','vaccines':'str'} )
from sklearn.preprocessing import MultiLabelBinarizer
# convert to contain lists instead of plain text
# TODO: get rid of leading spaces or whatever blanks they are instead of dropping all spaces.
# FIX: pandas >= 2.0 defaults str.replace to regex=False, which would treat the
# pattern as a literal and leave the text untouched; request regex explicitly.
v['vaccines'] = v['vaccines'].str.replace(r'[^a-zA-Z,]+', r'', regex=True).str.split(',')
# break into one 0/1 indicator column per vaccine
mlb = MultiLabelBinarizer()
indicators = pd.DataFrame(mlb.fit_transform(v['vaccines']),
                          columns=mlb.classes_, index=v.index)
vac_list = list(mlb.classes_)
vac_list
['Abdala', 'COVIranBarekat', 'CanSino', 'Corbevax', 'Covaxin', 'EpiVacCorona', 'IMBCAMS', 'JohnsonJohnson', 'KCONVAC', 'KoviVacChumakov', 'Medicago', 'Medigen', 'Moderna', 'Novavax', 'OxfordAstraZeneca', 'PfizerBioNTech', 'QazVac', 'SinopharmBeijing', 'SinopharmWuhan', 'Sinovac', 'Soberana', 'SoberanaPlus', 'SputnikLight', 'SputnikV', 'Turkovac', 'Valneva', 'ZF']
# Attach the indicator columns to the vaccine frame, then key it by iso_code
# so it can be joined onto the weekly per-country data.
v = pd.concat([v.reset_index(drop=True),
               indicators.reset_index(drop=True)], axis=1)
v.set_index('iso_code', drop=True, inplace=True)
dfi = dfi.join(v, on='iso_code')
def get_vac_corr(vac, df=None):
    """Correlation of excess_mortality vs total_vaccinations_per_hundred,
    computed separately for the countries where vaccine `vac` is absent (0)
    and where it is present (1).

    Parameters
    ----------
    vac : str
        Name of the 0/1 vaccine-indicator column to group by.
    df : pandas.DataFrame, optional
        Frame to operate on; defaults to the module-level `dfi`
        (backward-compatible with the original single-argument call).

    Returns
    -------
    pandas.DataFrame
        Columns ['vaccine_name', 'vaccine_presence_ind', 'correlation'],
        one row per group that has the cross-correlation defined.
    """
    if df is None:
        df = dfi
    # FIX: tuple indexing of a groupby (['a','b'] without the list) was
    # removed in pandas 2.0 — a list of column names is required.
    vac_corr = df.groupby(vac)[['excess_mortality', 'total_vaccinations_per_hundred']].corr()
    # long format: one row per (group, row_var, col_var) correlation cell
    long_form = vac_corr.stack().reset_index()
    long_form.columns = ['vaccine_presence_ind', 'b', 'c', 'correlation']
    # avoid chained-assignment replace(..., inplace=True)
    long_form['vaccine_presence_ind'] = long_form['vaccine_presence_ind'].replace(
        [0, 1], ['absent', 'present'])
    long_form['vaccine_name'] = vac
    # keep only the cross-correlation cells (one per presence group)
    return long_form.loc[
        long_form.b.isin(['excess_mortality']) & long_form.c.isin(['total_vaccinations_per_hundred'])
        , ['vaccine_name', 'vaccine_presence_ind', 'correlation']
    ]
# One absent/present correlation pair per vaccine, stacked into a tidy frame
# with a fresh RangeIndex (equivalent to reset_index() + deleting 'index').
vac_corrs = pd.concat([get_vac_corr(vac) for vac in vac_list], ignore_index=True)
print(vac_corrs)
vaccine_name vaccine_presence_ind correlation 0 Abdala absent -0.068929 1 Abdala present 0.299948 2 COVIranBarekat absent -0.058078 3 COVIranBarekat present -0.418062 4 CanSino absent -0.041384 5 CanSino present -0.176358 6 Corbevax absent -0.062468 7 Covaxin absent -0.050075 8 Covaxin present -0.126664 9 EpiVacCorona absent -0.061845 10 EpiVacCorona present 0.014714 11 IMBCAMS absent -0.062468 12 JohnsonJohnson absent -0.021952 13 JohnsonJohnson present -0.104711 14 KCONVAC absent -0.062468 15 KoviVacChumakov absent -0.062468 16 Medicago absent -0.061983 17 Medicago present -0.077507 18 Medigen absent -0.065765 19 Medigen present 0.598417 20 Moderna absent 0.033353 21 Moderna present -0.100983 22 Novavax absent -0.070997 23 Novavax present 0.032231 24 OxfordAstraZeneca absent 0.059856 25 OxfordAstraZeneca present -0.097692 26 PfizerBioNTech absent 0.119821 27 PfizerBioNTech present -0.073412 28 QazVac absent -0.062543 29 QazVac present 0.230300 30 SinopharmBeijing absent -0.045261 31 SinopharmBeijing present -0.058212 32 SinopharmWuhan absent -0.055567 33 SinopharmWuhan present -0.092859 34 Sinovac absent -0.024292 35 Sinovac present -0.108117 36 Soberana absent -0.064547 37 Soberana present -0.010759 38 SoberanaPlus absent -0.068929 39 SoberanaPlus present 0.299948 40 SputnikLight absent -0.042719 41 SputnikLight present -0.167592 42 SputnikV absent 0.005761 43 SputnikV present -0.133541 44 Turkovac absent -0.062468 45 Valneva absent -0.062255 46 Valneva present 0.160494 47 ZF absent -0.062088 48 ZF present -0.175187
# Grouped bar chart: correlation in the absent vs present country group,
# one pair of bars per vaccine, ordered by the larger of the two values.
fig = px.bar(vac_corrs
             , x='vaccine_name'
             , y='correlation'
             , color='vaccine_presence_ind'
             , color_discrete_sequence=["grey","blue",]
             , barmode="group"
             , title="""Correlation between excess_mortality and total_vaccinations_per_hundred
<br>in countries grouped by vaccine presence/absence."""
             , height=600, width=1000, # change to fit your output media
             )
fig.update_xaxes(categoryorder ='max descending')
fig.show()
Copyright 2021 Abbrivia GmbH https://www.abbrivia.com CC-BY (By Attribution) 4.0 https://creativecommons.org/licenses/by/4.0/legalcode Reuse our work freely!
All visualizations, and code produced in this notebook are completely open access under the Creative Commons BY license. You have the permission to use, distribute, and reproduce these in any medium, provided the source and authors are credited.
The data produced by third parties and made available by "Our World in Data" is subject to the license terms from the original third-party authors. Check the license of any third-party data before use and redistribution on 'https://ourworldindata.org/coronavirus' site (see below).
See the definitions and further discussion on the used dataset at the "Our World in Data" site https://ourworldindata.org/covid-vaccinations
The data is taken specifically from https://covid.ourworldindata.org/data/owid-covid-data.xlsx file
Hannah Ritchie, Edouard Mathieu, Lucas Rodés-Guirao, Cameron Appel, Charlie Giattino, Esteban Ortiz-Ospina, Joe Hasell, Bobbie Macdonald, Diana Beltekian and Max Roser (2020) - "Coronavirus Pandemic (COVID-19)". Published online at OurWorldInData.org. Retrieved from: 'https://ourworldindata.org/coronavirus' [Online Resource]
We use Excel file because it contains the data format information in itself. If you want to run this more often consider manually downloading the data and sourcing it locally as shown in the next line (commented out).